In [None]:
%matplotlib widget

import ipywidgets
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

plt.ioff();

# Regresión lineal

Consideremos los siguientes datos de entrenamiento en un problema de regresión lineal con modelos simples uniparamétricos y pérdida cuadrática.

| $x$ | $y$ |
|-----|-----|
|  1  |  4  |
|  2  |  8  |
|  5  |  5  |
|  6  |  6  |
|  7  |  7  |
|  8  |  8  |

$$\mathcal{F} = \left\{ f_\mathbf{w} \mid \mathbf{w}\in\mathbb{R}, f_\mathbf{w}(x) = \mathbf{w}\cdot\phi(x), \phi(x) = x\right\}$$

$$\text{Loss}(x, y, \mathbf{w}) = (f_\mathbf{w}(x) - y)^2$$

In [None]:
# Training set as a list of (x, y) pairs; every function below that takes
# `Dtrain` iterates over pairs of this shape.
Dtrain = [
    (1, 4),
    (2, 8),
    (5, 5),
    (6, 6),
    (7, 7),
    (8, 8),
]

In [None]:
def features(x):
    """Feature map phi: the identity, so the only feature is the input itself."""
    phi_x = x
    return phi_x

In [None]:
def predict(w, x):
    """Evaluate the one-parameter linear predictor: w times the feature value of x."""
    phi_x = features(x)
    return w * phi_x

In [None]:
def loss(x, y, w):
    """Squared loss of the predictor with weight w on the single example (x, y)."""
    residual = predict(w, x) - y
    return residual ** 2

In [None]:
# Close the current figure so that re-running this cell does not pile up figures.
plt.close()

# Endpoints used for the predictor line; the axes limits set below extend one
# unit past xmax / ymax.
xmin, xmax = 0, 8
ymin, ymax = 0, 8

# Slider settings: starting weight, lower bound, upper bound, increment.
w_init, w_min, w_max, w_step = 0.5, 0.0, 20.0, 0.01

def plot_Dtrain(ax):
    """Scatter the (x, y) pairs of the global `Dtrain` on `ax` in black.

    Returns whatever `ax.scatter` returns.
    """
    xs, ys = [], []
    for x, y in Dtrain:
        xs.append(x)
        ys.append(y)
    return ax.scatter(xs, ys, c = "black")

def plot_predictor(ax):
    """Draw the predictor for the global `w_init` between `xmin` and `xmax`.

    Returns whatever `ax.plot` returns; `update_plot` below moves its first
    element through `lines[0]`.
    """
    xs = [xmin, xmax]
    ys = [predict(w_init, x) for x in xs]
    return ax.plot(xs, ys, c = "black")

fig = plt.figure(figsize=(5,5))
ax = fig.add_subplot()
plot_Dtrain(ax)
lines = plot_predictor(ax)
ax.set_xlabel("$x$", size=20)
ax.set_xlim((xmin, xmax + 1))
ax.set_ylabel("$y$", size=20)
ax.set_ylim((ymin, ymax + 1))
# Hide the header of the figure canvas.
fig.canvas.header_visible = False

def update_plot(w):
    """Slider callback: recompute the predictor line for weight `w` and redraw.

    NOTE(review): the line is recomputed over `ax.get_xlim()`, i.e. the axes
    limits set above, whereas the initial line from `plot_predictor` only spans
    `xmin..xmax`; the drawn segment therefore changes length on first update.
    """
    xs = ax.get_xlim()
    ys = [predict(w, x) for x in xs]
    lines[0].set_data(xs, ys)
    fig.canvas.draw()
    fig.canvas.flush_events()

# Bind `update_plot` to a float slider for w.
widget = ipywidgets.interactive(
    update_plot,
    w = ipywidgets.FloatSlider(
        orientation = "horizontal",
        description = "w",
              value = w_init,
                min = w_min,
                max = w_max,
               step = w_step,
             layout = ipywidgets.Layout(width='90%')
    ),
)

# Show the slider first, then the figure canvas it controls.
display(widget)
display(fig.canvas);

## Pérdida promedio

En los problemas de regresión, utilizamos la pérdida promedio para plantear un problema de optimización que nos permita encontrar el mejor predictor utilizando descenso de gradiente.

$$\begin{aligned}
\text{TrainLoss}(\mathbf{w}) &= \frac{1}{|\mathcal{D}_\text{train}|} \sum_{(x,y)\in\mathcal{D}_\text{train}} \text{Loss}(x, y, \mathbf{w}) \\
\nabla_\mathbf{w} \text{TrainLoss}(\mathbf{w}) &= \frac{1}{|\mathcal{D}_\text{train}|} \sum_{(x,y)\in\mathcal{D}_\text{train}} \nabla_\mathbf{w}\text{Loss}(x, y, \mathbf{w}) \\
&= \frac{1}{|\mathcal{D}_\text{train}|} \sum_{(x,y)\in\mathcal{D}_\text{train}} \nabla_\mathbf{w}(f_\mathbf{w}(x) - y)^2 \\
&= \frac{1}{|\mathcal{D}_\text{train}|} \sum_{(x,y)\in\mathcal{D}_\text{train}} 2(f_\mathbf{w}(x) - y)\phi(x)
\end{aligned}$$

Aunque también podemos encontrar el valor de $\mathbf{w}$ que minimiza la pérdida de entrenamiento analíticamente...

$$\begin{aligned}
\nabla_\mathbf{w}\text{TrainLoss}(\mathbf{w}) &= \frac{1}{6}\sum_{(x,y)\in\mathcal{D}_\text{train}} 2(\mathbf{w}\cdot\phi(x)-y)\phi(x) \\
&= \frac{1}{6}\sum_{(x,y)\in\mathcal{D}_\text{train}} 2(\mathbf{w}\cdot x-y)x \\
&= \frac{2}{6}\sum_{(x,y)\in\mathcal{D}_\text{train}} (\mathbf{w}\cdot x-y)x \\
&= \frac{1}{3}\left[ (\mathbf{w}\cdot 1-4)1 + (\mathbf{w}\cdot 2-8)2 + (\mathbf{w}\cdot 5-5)5 + (\mathbf{w}\cdot 6-6)6 + (\mathbf{w}\cdot 7-7)7 + (\mathbf{w}\cdot 8-8)8  \right] \\
&= \frac{1}{3}\left[ (\mathbf{w}-4) + (4\mathbf{w}-16) + (25\mathbf{w}-25) + (36\mathbf{w}-36) + (49\mathbf{w}-49) + (64\mathbf{w}-64)  \right] \\
&= \frac{1}{3}\left[ \mathbf{w}-4 + 4\mathbf{w}-16 + 25\mathbf{w}-25 + 36\mathbf{w}-36 + 49\mathbf{w}-49 + 64\mathbf{w}-64  \right] \\
&= \frac{1}{3}\left[ (\mathbf{w} + 4\mathbf{w} + 25\mathbf{w} + 36\mathbf{w} + 49\mathbf{w} + 64\mathbf{w}) + (-4-16-25-36-49-64)  \right] \\
&= \frac{1}{3}\left[ 179\mathbf{w}-194  \right] \\
&= \frac{179}{3}\mathbf{w}-\frac{194}{3}
\end{aligned}$$

Encontramos el peso $\mathbf{w}_\min$ que minimiza $\text{TrainLoss}$ encontrando la raíz del gradiente de $\text{TrainLoss}$.

$$\begin{aligned}
\nabla_\mathbf{w}\text{TrainLoss}(\mathbf{w}_\min) &= 0 \\
\frac{179}{3}\mathbf{w}_\min-\frac{194}{3} &= 0 \\
\frac{179}{3}\mathbf{w}_\min &= \frac{194}{3} \\
\mathbf{w}_\min &= \frac{194\cdot3}{3\cdot179} \\
                &= \frac{582}{537} \\
                &\approx 1.083798882681564245810055866
\end{aligned}$$

In [None]:
def train_loss(Dtrain, w):
    """Average loss of weight `w` over the examples in `Dtrain`.

    Parameters:
        Dtrain: sized collection of (x, y) pairs.
        w: weight of the predictor being evaluated.

    Returns:
        The sum of `loss(x, y, w)` over all pairs divided by their count.

    Raises:
        ValueError: if `Dtrain` is empty (the mean would be undefined; this
            replaces the bare ZeroDivisionError raised by the division).
    """
    examples = len(Dtrain)
    if examples == 0:
        raise ValueError("train_loss requires at least one training example")
    total = sum(loss(x, y, w) for x, y in Dtrain)
    return total / examples

In [None]:
def grad_train_loss(Dtrain, w):
    """Gradient with respect to `w` of the average squared loss over `Dtrain`.

    Parameters:
        Dtrain: sized collection of (x, y) pairs.
        w: weight at which the gradient is evaluated.

    Returns:
        The mean over all pairs of 2 * (predict(w, x) - y) * features(x).

    Raises:
        ValueError: if `Dtrain` is empty (the mean would be undefined; this
            replaces the bare ZeroDivisionError raised by the division).
    """
    examples = len(Dtrain)
    if examples == 0:
        raise ValueError("grad_train_loss requires at least one training example")
    total = sum(2*(predict(w, x)-y)*features(x) for x, y in Dtrain)
    return total / examples

In [None]:
def gradient_descent(Dtrain, eta, T):
    """Minimise the average training loss with fixed-step gradient descent.

    Parameters:
        Dtrain: collection of (x, y) pairs passed to `grad_train_loss`.
        eta: step size applied to every update.
        T: number of updates to perform.

    Returns:
        The weight after `T` updates, starting from 0.0.
    """
    w = 0.0
    # The training loss itself is not needed for the update, so it is not
    # evaluated here; only the gradient drives the iteration.
    for _ in range(T):
        w = w - eta * grad_train_loss(Dtrain, w)
    return w

In [None]:
# Fit on the whole training set: step size 0.01, 100 updates.
w_best = gradient_descent(Dtrain, eta=0.01, T=100)

In [None]:
# Close the current figure so that re-running this cell does not pile up figures.
plt.close()

# Range of weights sampled for the curve, and the vertical window of the plot.
w_min, w_max, w_step = 0.0, 5.0, 0.1
l_min, l_max = 0, 80

fig = plt.figure();
ax = fig.add_subplot()
xs = np.arange(w_min, w_max, w_step)
ys = [train_loss(Dtrain, w) for w in xs]
ax.plot(xs, ys, c = "black", label = "$\\text{TrainLoss}$")
# Round to three decimals for the legend. Slicing str(w_best) truncates instead
# of rounding (1.0837... would read 1.083) and yields nonsense if the float is
# printed in scientific notation.
ax.scatter([w_best], [train_loss(Dtrain, w_best)], c = "black", s = 30, label = "$\\mathbf{w}_\\min =$" + str(round(w_best, 3)))
ax.set_xlabel("$\\mathbf{w}$")
ax.set_xlim((w_min, w_max))
ax.set_ylabel("$\\text{loss}$")
ax.set_ylim((l_min, l_max))
ax.legend()
# Hide the header of the figure canvas.
fig.canvas.header_visible = False

display(fig.canvas);

In [None]:
# Drop the current figure before building a new one.
plt.close()

# Endpoints of the predictor line; the axes reach one unit further.
xmin, xmax = 0, 8
ymin, ymax = 0, 8

# The line drawn below uses the weight found by gradient descent.
# w_min / w_max / w_step are rebound here but not read in this cell.
w_init = w_best
w_min, w_max, w_step = 0.0, 20.0, 0.01

def plot_Dtrain(ax):
    """Scatter every (x, y) example of the global `Dtrain` in black."""
    xs = [x for x, _ in Dtrain]
    ys = [y for _, y in Dtrain]
    return ax.scatter(xs, ys, c="black")

def plot_predictor(ax):
    """Draw the prediction of weight `w_init` between `xmin` and `xmax`."""
    endpoints = [xmin, xmax]
    heights = [predict(w_init, x) for x in endpoints]
    return ax.plot(endpoints, heights, c="black")


plt.ioff()

fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot()
plot_Dtrain(ax)
lines = plot_predictor(ax)
ax.set_xlabel("$x$", size=20)
ax.set_ylabel("$y$", size=20)
ax.set_xlim((xmin, xmax + 1))
ax.set_ylim((ymin, ymax + 1))
fig.canvas.header_visible = False

display(fig.canvas);

¡Perfecto! Hemos encontrado el mejor predictor para los datos


<img style="width:100%; padding-top:10rem" src="https://media.giphy.com/media/v1.Y2lkPTc5MGI3NjExa21ydGw3cWFuZzd4a3Ztd2hxMGEwc3gwazgycHQ0bnp3ODRxb29uZiZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw/NVEtOZRkhukZVBB32W/giphy.gif"></img>

Los datos presentan dos comportamientos lineales diferentes.

Un comportamiento lineal corresponde a una minoría de los ejemplos y el otro a una mayoría.

Se han encontrado este tipo de discrepancias en datos de entrenamiento utilizados en la vida real.

Puedes conocer más al respecto en el proyecto [Gender Shades](http://gendershades.org/).

Los modelos de aprendizaje automático, de ser entrenados y utilizados ingenuamente, pueden desfavorecer a ciertos grupos, usualmente minorías. También esto ocurre al incorporar pocos datos de entrenamiento para clases protegidas ya sea en cuestión de sexo, género, etnicidad, estatus socioeconómico, ubicación geográfica, entre otras. Las predicciones realizadas y el uso que les damos los humanos tienen [serias consecuencias](https://youtu.be/qOEtQYGwZ8w?si=CLka51Wsg05Rjglx).

# Regresión lineal con grupos


Identifiquemos los dos comportamientos lineales con una tercera columna de grupo.

| $x$ | $y$ | $g$ |
|-----|-----|-----|
|  1  |  4  |  A  |
|  2  |  8  |  A  |
|  5  |  5  |  B  |
|  6  |  6  |  B  |
|  7  |  7  |  B  |
|  8  |  8  |  B  |

In [None]:
# Training set with a group label: (x, y, g) triples. The (x, y) values are the
# same as in `Dtrain`; g is the group tag ("A" or "B").
DtrainG = [
    (1, 4, 'A'),
    (2, 8, 'A'),
    (5, 5, 'B'),
    (6, 6, 'B'),
    (7, 7, 'B'),
    (8, 8, 'B'),
]

In [None]:
def Dtrain_groups(Dtrain):
    """Collect the distinct group labels (third field) of the (x, y, g) triples as a set."""
    return {label for _x, _y, label in Dtrain}

def DtrainGroup(Dtrain, group):
    """Return, in their original order, the (x, y) pairs of the triples labelled `group`."""
    members = []
    for x, y, g in Dtrain:
        if group == g:
            members.append((x, y))
    return members

In [None]:
# Drop the current figure before building a new one.
plt.close()

# Endpoints of the predictor line; the axes reach one unit further.
xmin, xmax = 0, 8
ymin, ymax = 0, 8

# The line drawn below uses the weight found by gradient descent.
# w_min / w_max / w_step are rebound here but not read in this cell.
w_init = w_best
w_min, w_max, w_step = 0.0, 20.0, 0.01

def plot_Dtrain(ax):
    """Scatter `DtrainG` by group: "A" in orange, all other labels in purple (legend "B").

    Returns the two scatter artists as a list, group A first.
    """
    group_a = [(x, y) for x, y, g in DtrainG if g == "A"]
    others = [(x, y) for x, y, g in DtrainG if not g == "A"]
    return [
        ax.scatter([x for x, _ in group_a], [y for _, y in group_a], c="orange", label="A"),
        ax.scatter([x for x, _ in others], [y for _, y in others], c="purple", label="B"),
    ]

def plot_predictor(ax):
    """Draw the prediction of weight `w_init` between `xmin` and `xmax`."""
    endpoints = [xmin, xmax]
    heights = [predict(w_init, x) for x in endpoints]
    return ax.plot(endpoints, heights, c="black")


plt.ioff()

fig = plt.figure(figsize=(5, 5))
ax = plt.axes()
plot_Dtrain(ax)
lines = plot_predictor(ax)
ax.set_xlabel("$x$", size=20)
ax.set_ylabel("$y$", size=20)
ax.set_xlim((xmin, xmax + 1))
ax.set_ylim((ymin, ymax + 1))
ax.legend()
fig.canvas.header_visible = False

display(fig.canvas);

¿A qué grupo favorece más el “mejor” predictor?

## Pérdida por grupo

Consideremos la misma pérdida cuadrática, pero ahora tomamos la pérdida de entrenamiento por cada grupo.

Veamos la pérdida de entrenamiento promedio por cada grupo...

In [None]:
# Drop the current figure before building a new one.
plt.close()

# Sampled weights and the vertical window of the plot.
w_min, w_max, w_step = 0.0, 5.0, 0.1
l_min, l_max = 0, 80

fig = plt.figure()
ax = fig.add_subplot()
xs = np.arange(w_min, w_max, w_step)

# One average-loss curve per group, evaluated on that group's examples only.
examples_A = DtrainGroup(DtrainG, "A")
examples_B = DtrainGroup(DtrainG, "B")
ys1 = [train_loss(examples_A, w) for w in xs]
ys2 = [train_loss(examples_B, w) for w in xs]
ax.plot(xs, ys1, c="orange", label="$\\text{TrainLoss}_A$")
ax.plot(xs, ys2, c="purple", label="$\\text{TrainLoss}_B$")

ax.set_xlabel("$\\mathbf{w}$")
ax.set_ylabel("$\\text{loss}$")
ax.set_xlim((w_min, w_max))
ax.set_ylim((l_min, l_max))
ax.legend()
fig.canvas.header_visible = False

display(fig.canvas);

Si usamos la $\mathbf{w}_\min$ calculada previamente, la pérdida de entrenamiento promedio para el grupo A es

In [None]:
# Average loss of w_best measured only on the examples labelled "A".
train_loss(DtrainGroup(DtrainG, "A"), w_best)

Mientras que la pérdida de entrenamiento promedio para el grupo B es

In [None]:
# Average loss of w_best measured only on the examples labelled "B".
train_loss(DtrainGroup(DtrainG, "B"), w_best)

Aunque la pérdida de entrenamiento promedio sobre todo el conjunto de entrenamiento fue

In [None]:
# Average loss of w_best over the whole (ungrouped) training set.
train_loss(Dtrain, w_best)

hay una disparidad muy grande en el rendimiento entre los grupos, donde el grupo A es el más afectado.

## Pérdida máxima de grupo

Queremos capturar las pérdidas por grupo con un solo valor. Para hacer esto vamos a considerar el peor caso sobre todos los grupos.

Definimos la **pérdida máxima de grupo** como la máxima pérdida de entrenamiento sobre todos los grupos $g$:

$$\text{TrainLoss}_\max(\mathbf{w}) = \max_g \text{TrainLoss}_g(\mathbf{w})$$

In [None]:
def train_loss_max(Dtrain, w):
    """Worst per-group average loss of weight `w`.

    Parameters:
        Dtrain: collection of (x, y, g) triples, g being the group label.
        w: weight of the predictor being evaluated.

    Returns:
        A `(loss, group)` tuple: the largest per-group `train_loss` and the
        label of the group that attains it.
    """
    # A set of string labels iterates in a hash-dependent order that can differ
    # from one interpreter run to the next. `max` keeps the first maximal item,
    # so iterating in a fixed (sorted) order makes the winning group, and hence
    # the gradient taken from it, reproducible when two groups tie.
    # Sorting by repr keeps this working for labels that are not mutually orderable.
    gs = sorted(Dtrain_groups(Dtrain), key = repr)
    return max(
        ((train_loss(DtrainGroup(Dtrain, g), w), g) for g in gs),
        key = lambda p: p[0],
    )

In [None]:
# Worst per-group average loss at w_best, shown as a (loss, group) pair.
train_loss_max(DtrainG, w_best)

In [None]:
def grad_train_loss_max(Dtrain, w):
    """Gradient of the maximum group loss at `w`.

    Picks the group reported by `train_loss_max` and returns the gradient of
    the average loss restricted to that group's examples.
    """
    worst_group = train_loss_max(Dtrain, w)[1]
    worst_examples = DtrainGroup(Dtrain, worst_group)
    return grad_train_loss(worst_examples, w)

In [None]:
def gradient_descent_max(Dtrain, eta, T):
    """Run `T` fixed-step updates on the maximum group loss, starting from w = 0.0.

    Each update moves `w` against `grad_train_loss_max(Dtrain, w)` scaled by
    `eta`; the final weight is returned.
    """
    w = 0.0
    for _ in range(T):
        step = eta * grad_train_loss_max(Dtrain, w)
        w = w - step
    return w

In [None]:
# Fit against the worst-group loss: step size 0.001, 100 updates.
w_gbest = gradient_descent_max(DtrainG, eta=0.001, T=100)

In [None]:
# Close the current figure so that re-running this cell does not pile up figures.
plt.close()

# Sampled weights (finer step than the earlier curves) and vertical window.
w_min, w_max, w_step = 0.0, 5.0, 0.01
l_min, l_max = -1, 80

fig = plt.figure(figsize=(8,5));
ax = fig.add_subplot()
xs = np.arange(w_min, w_max, w_step)
# ys0: loss over all examples; ys1 / ys2: loss restricted to group A / B;
# ys3: the maximum of the per-group losses at each w.
ys0 = [train_loss(Dtrain, w) for w in xs]
ys1 = [train_loss(DtrainGroup(DtrainG, "A"), w) for w in xs]
ys2 = [train_loss(DtrainGroup(DtrainG, "B"), w) for w in xs]
ys3 = [train_loss_max(DtrainG, w)[0] for w in xs]
ax.plot(xs, ys0, c = "black", label = "$\\text{TrainLoss}$")
ax.plot(xs, ys1, c = "orange", label = "$\\text{TrainLoss}_A$")
ax.plot(xs, ys2, c = "purple", label = "$\\text{TrainLoss}_B$")
ax.plot(xs, ys3, c = "blue", label = "$\\text{TrainLoss}_\\max$")

# NOTE(review): this rebinds the notebook-level `w_best`, now with 1000 updates
# instead of the 100 used when it was first computed; cells below see this value.
w_best = gradient_descent(Dtrain, 0.01, 1000)
ax.scatter([w_best], [train_loss(Dtrain, w_best)],
           c = "black", s = 30, label = "$\\mathbf{w}_\\min =$" + str(round(w_best, 3)))

# Minimiser of the loss restricted to group A.
w_A = gradient_descent(DtrainGroup(DtrainG, "A"), 0.01, 1000)
ax.scatter([w_A], [train_loss(DtrainGroup(DtrainG, "A"), w_A)],
           c = "orange", s = 30, label = "$\\mathbf{w}_A =$" + str(round(w_A, 3)))

# Minimiser of the loss restricted to group B.
w_B = gradient_descent(DtrainGroup(DtrainG, "B"), 0.01, 1000)
ax.scatter([w_B], [train_loss(DtrainGroup(DtrainG, "B"), w_B)],
           c = "purple", s = 30, label = "$\\mathbf{w}_B =$" + str(round(w_B, 3)))

# NOTE(review): likewise rebinds `w_gbest`, with 1000 updates instead of 100.
w_gbest = gradient_descent_max(DtrainG, 0.001, 1000)
ax.scatter([w_gbest], [train_loss_max(DtrainG, w_gbest)[0]],
           c = "blue", s = 30, label = "$\\mathbf{w}_{g\\min} =$" + str(round(w_gbest, 3)))

ax.set_xlabel("$\\mathbf{w}$")
ax.set_xlim((w_min, w_max))
ax.set_ylabel("$\\text{loss}$")
ax.set_ylim((l_min, l_max))
ax.legend()
# Hide the header of the figure canvas.
fig.canvas.header_visible = False

display(fig.canvas);

In [None]:
# Drop the current figure before building a new one.
plt.close()

# Endpoints of the predictor lines; the axes reach one unit further.
xmin, xmax = 0, 8
ymin, ymax = 0, 8

def plot_Dtrain(ax):
    """Scatter `DtrainG` by group: "A" in orange, all other labels in purple (legend "B").

    Returns the two scatter artists as a list, group A first.
    """
    group_a = [(x, y) for x, y, g in DtrainG if g == "A"]
    others = [(x, y) for x, y, g in DtrainG if not g == "A"]
    return [
        ax.scatter([x for x, _ in group_a], [y for _, y in group_a], c="orange", label="A"),
        ax.scatter([x for x, _ in others], [y for _, y in others], c="purple", label="B"),
    ]

def plot_predictor(ax):
    """Black line: prediction of weight `w_best` between `xmin` and `xmax`."""
    endpoints = [xmin, xmax]
    return ax.plot(endpoints, [predict(w_best, x) for x in endpoints], c="black")

def plot_predictor2(ax):
    """Blue line: prediction of weight `w_gbest` between `xmin` and `xmax`."""
    endpoints = [xmin, xmax]
    return ax.plot(endpoints, [predict(w_gbest, x) for x in endpoints], c="blue")


plt.ioff()

fig = plt.figure(figsize=(5, 5))
ax = plt.axes()
plot_Dtrain(ax)
plot_predictor(ax)
plot_predictor2(ax)
ax.set_xlabel("$x$", size=20)
ax.set_ylabel("$y$", size=20)
ax.set_xlim((xmin, xmax + 1))
ax.set_ylim((ymin, ymax + 1))
ax.legend()
ax.set_title("Predicción equitativa (azul) vs mínimos cuadrados (negro)")
fig.canvas.header_visible = False

display(fig.canvas);